In [ ]:
What share of recently-born people's biographies in Confucian-culture Wikipedias (Japanese, Chinese, Korean) are about celebrities, compared to other language editions?
In [3]:
import pandas as pd
import numpy
from collections import defaultdict
import json
import statsmodels.api as sm
from matplotlib.pylab import style
# Use the FiveThirtyEight look for all matplotlib figures.
style.use('fivethirtyeight')
# %pylab pulls numpy/matplotlib names (np, plt, ...) into the namespace.
# NOTE(review): %pylab is deprecated in modern IPython — prefer
# %matplotlib inline plus explicit imports.
%pylab inline
# Sentinel the snapshot CSV uses for "no value" (Java Integer.MIN_VALUE);
# mapped to NaN when the CSV is read below.
java_min_int = -2147483648
# The Wikipedia language editions analysed in this notebook.
WIKIS =('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'dewiki', 'enwiki','kowiki')
In [4]:
# Load the 2014-10-13 Wikidata gender snapshot; the Java MIN_INT sentinel
# becomes NaN so missing values behave like ordinary pandas NaNs.
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    """Split a pipe-delimited cell such as 'a|b|' into a list ['a', 'b'].

    NaN cells pass through unchanged; any other non-string input falls
    off the end and yields None (matching the original behaviour).
    """
    if type(q_str) is float and numpy.isnan(q_str):
        return q_str
    if type(q_str) is str:
        # The serialised format always ends with a trailing '|', so the
        # final (empty) piece is dropped.
        return q_str.split('|')[:-1]
# Convert the pipe-delimited 'gender' and 'site_links' columns into lists.
for col in ['gender','site_links']:
    allrecs[col] = allrecs[col].apply(split_column)
In [5]:
def has(xxwiki):
    """Build a row predicate: True when the row's 'site_links' list
    contains `xxwiki`; False otherwise (including non-list cells)."""
    def has_xx(row):
        links = row['site_links']
        return isinstance(links, list) and xxwiki in links
    return has_xx
Generate the per-wiki, per-decade, per-gender QID files for remote inspection.
In [6]:
# Quick sanity check of the parsed snapshot.
allrecs.head(20)
Out[6]:
In [40]:
def makedecades(b, e):
    """Yield (decade_start, decade_end) pairs stepping by 10 over [b, e)."""
    start = b
    while start < e:
        yield start, start + 10
        start += 10
def isfemale(x):
    """True when the first gender QID in the list is Q6581072 (female);
    non-list cells (e.g. NaN) count as not female."""
    return isinstance(x, list) and x[0] == 'Q6581072'
def ismale(x):
    """True when the first gender QID in the list is Q6581097 (male);
    non-list cells (e.g. NaN) count as not male."""
    return isinstance(x, list) and x[0] == 'Q6581097'
def nogender(x):
    """True when the gender cell is not a list, i.e. no gender recorded."""
    return not isinstance(x, list)
# For every wiki x gender x decade bucket, dump the matching QIDs to a
# JSON file and copy it to the Labs host for the description-fetch job.
for xxwiki in WIKIS:
    has_wiki = has(xxwiki)
    recs = allrecs[allrecs.apply(has_wiki, axis=1)]
    for gender, gender_test in (('female', isfemale), ('male', ismale), ('nogender', nogender)):
        grecs = recs[recs['gender'].apply(gender_test)]
        for start_year, stop_year in makedecades(1930,1990):
            # People born within [start_year, stop_year).
            modrecs = grecs[(grecs['dob'] >= start_year) &(grecs['dob'] < stop_year)]
            #print len(modrecs), xxwiki, start_year
            filepath = 'helpers/inspection/{}_{}_{}.json'.format(xxwiki, start_year,gender)
            # NOTE(review): the open() handle is never closed; a `with`
            # block would be safer.
            json.dump(list(modrecs['qid']), open(filepath,'w'))
            # Ship the bucket file to Labs for remote processing.
            !scp $filepath wmflabs-tools:/home/maximilianklein/inspectionshortcut
Then wait for the remote 200-word-summary task to finish on Labs, under viafbot/inspection.
In [42]:
# Directory for the expanded (full-text) descriptions fetched from Labs.
!mkdir helpers/inspection/expanded_descriptions
In [78]:
# Pull the per-bucket description JSONs produced on Labs back down.
!scp wmflabs-tools:/home/maximilianklein/inspectionshortcut/output/* helpers/inspection/expanded_descriptions/.
In [7]:
# Capture the downloaded file names (IPython turns !ls output into a list);
# len() shows how many buckets came back.
description_files = !ls helpers/inspection/expanded_descriptions/
len(description_files)
Out[7]:
In [8]:
# Occupation keywords per language that flag a biography as "celebrity"
# (actor, athlete, singer, musician, model, idol).
celebrity_dict = {'jawiki': [u'俳優', u'選手', u'歌手', u'ミュージシャン', u'モデル', u'アイドル'],
                  'zhwiki': [u'演員', u'運動員', u'歌手', u'音乐家', u'模特兒', u'偶像'],
                  'tlwiki': [u'artista', 'aktor', u'player', u'mang-aawit', u'musikero', u'modelo', u'idolo'],
                  'urwiki': [u'اردو', u'کھلاڑ', u'گلوکار' , u'موسیقار' , u'ماڈل', u'بت'],
                  'dewiki': [u'schauspieler' , u'spieler', u'Musiker', u'Sänger', u'Modell', u'Idol'],
                  'enwiki' :[u'actor', u'actress', u'player', u'singer', u'musician', u'model', u'idol'],
                  'kowiki' : [u'배우', u'선수', u'가수', u'음악가', u'모델', u'우상']}

def intext(text, xxwiki):
    """Return True when `text` mentions any English or `xxwiki`-language
    celebrity keyword.  Comparison is done on UTF-8-encoded, lowercased
    bytes (lowercasing only affects the ASCII range); falsy `text`
    (None, empty) yields False."""
    if not text:
        return False
    haystack = text.encode('utf-8').lower()
    keywords = celebrity_dict['enwiki'] + celebrity_dict[xxwiki]
    return any(word.encode('utf-8').lower() in haystack for word in keywords)
# Aggregate each downloaded bucket file into one long-format frame:
# (wiki, decade, gender) -> share of bios matching a celebrity keyword.
celebdf = pd.DataFrame(columns=['wiki','decade','gender','celeb_per'])
for f in description_files:
    # File names encode wiki, decade and gender separated by underscores
    # (see the export loop above).
    parts = f.split('_')
    xxwiki, decade, gender = parts[0], parts[1], parts[2]
    df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/expanded_descriptions/{}'.format(f), 'r')), orient='index')
    df['celeb'] = df['text'].apply(lambda text: intext(text,xxwiki))
    # Fraction of biographies flagged as celebrity in this bucket.
    test_per = df['celeb'].sum()/float(len(df))
    # NOTE(review): growing a DataFrame row-by-row with .append is
    # quadratic, and .append was removed in modern pandas; collecting
    # dicts and building the frame once would be cleaner.
    celebdf = celebdf.append({'wiki':xxwiki, 'decade':int(decade), 'gender':gender, 'celeb_per':test_per}, ignore_index=True)
In [4]:
# One-hot encode wiki and gender for the regression design matrix.
dummy_langs = pd.get_dummies(celebdf['wiki'])
dummy_gender = pd.get_dummies(celebdf['gender'])
# Fix the column order so the label-based slices below are stable.
dummy_gender = dummy_gender[['male','female','nogender']]
print dummy_langs.head(2)
print dummy_gender.head(2)
In [5]:
# Design matrix: decade + language dummies (sliced from 'enwiki' onward,
# dropping earlier columns to avoid full collinearity) + only the 'female'
# gender dummy, plus an explicit intercept.
# NOTE(review): .ix was removed in modern pandas — .loc performs the same
# label-based slicing here.
catdf = celebdf[['celeb_per','decade']].join(dummy_langs.ix[:,'enwiki':]).join(dummy_gender.ix[:,'female':'female'])
catdf['intercept'] = 1.0
In [6]:
# Inspect pairwise correlations among predictors before fitting.
catdf.corr()
Out[6]:
In [7]:
# Regress celeb_per on every other column (all columns after celeb_per).
train_cols = catdf.columns[1:]
# NOTE(review): Logit is designed for a binary response; celeb_per is a
# proportion in [0, 1], so a binomial GLM (or weighting by bucket size)
# would be more principled — confirm the intent.
logit = sm.Logit(catdf['celeb_per'], catdf[train_cols])
result= logit.fit()
In [19]:
# Coefficient table for the fitted model.
result.summary()
Out[19]:
In [17]:
# Small-multiple heatmaps: celebrity % by decade (rows) x wiki (columns),
# one panel per gender category.
subj_list = ['female','male','nogender']
fig, axes = plt.subplots(nrows = 1, ncols = len(subj_list), sharex='col', sharey='row')
for ax, subj in zip(axes, subj_list):
    natlangdf = celebdf[celebdf['gender'] == subj]
    # NOTE(review): rows=/cols= are the pre-0.14 pandas spellings of
    # index=/columns= for pivot_table.
    natlangpiv = pd.pivot_table(natlangdf, values='celeb_per', rows='decade', cols='wiki')
    # Fixed column order, human-readable labels, fractions -> percentages.
    natlangpiv = natlangpiv[['jawiki','zhwiki','kowiki','tlwiki','urwiki','dewiki','enwiki']]
    natlangpiv.columns = ['Japanese', 'Chinese', 'Korean', 'Tagalog', 'Urdu', 'German', 'English']
    natlangpiv = natlangpiv * 100
    heatmap = ax.pcolor(natlangpiv, cmap='Purples', vmin=0, vmax=100)
    # Centre the tick labels on the heatmap cells.
    ax.set_yticks(np.arange(0.5, len(natlangpiv.index), 1))
    ax.set_yticklabels(map(int, natlangpiv.index))
    ax.set_xticks(np.arange(0.5, len(natlangpiv.columns), 1))
    ax.set_xticklabels(natlangpiv.columns, rotation=90)
fig.suptitle('''Heatmap of Celebrity Biography %, By Decade of Birth versus Wikipedia Language by Gender''', fontsize=18)
fig.set_size_inches(12,4,dpi=600)
#fig.tight_layout()
subj_titles = ['Female','Male','Not Recorded or Non-Binary']
metric_titles =['Decade']
# One shared colourbar, keyed to the last panel's mappable.
cbar = plt.colorbar(mappable=heatmap, ax=ax, format="%.0f%%")
for i in range(len(subj_titles)):
    axes[i].set_title(subj_titles[i])
# Panels share axes, so collapse the gaps between them.
fig.subplots_adjust(wspace=0.0, hspace=0.0, top=0.85)
In [17]:
# NOTE(review): leftover scratch cell — the bare name only echoes the
# function object (from %pylab); safe to delete.
subplots_adjust
Out[17]:
In [11]:
# Per-wiki keywords for the two professions broken out separately.
actress_dict = {'jawiki': u'俳優', 'zhwiki': u'演員', 'tlwiki':u'artista', 'urwiki': u'اردو', 'dewiki': 'schauspieler' , 'enwiki' :'actress'}
player_dict = {'jawiki': u'選手', 'zhwiki': u'運動員', 'tlwiki':u'player', 'urwiki': u'کھلاڑ', 'dewiki': 'spieler' , 'enwiki' :'player'}

def multiword(xxwiki, prof_dict):
    """Build a predicate testing whether a description mentions the
    profession word from `prof_dict`, in English or in the `xxwiki`
    language (case-insensitive substring match; falsy text -> False)."""
    def intext(text):
        if not text:
            return False
        lowered = text.lower()
        return prof_dict['enwiki'] in lowered or prof_dict[xxwiki] in lowered
    return intext
# Per-wiki share of modern bios matching the actress / player keywords.
celeb = defaultdict(dict)
for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'):
    df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r')), orient='index')
    for prof, test in (('actress', multiword(xxwiki, actress_dict)), ('player',multiword(xxwiki, player_dict))):
        df[prof] = df['text'].apply(test)
        # Fraction of descriptions mentioning this profession.
        test_per = df[prof].sum()/float(len(df))
        celeb[xxwiki][prof] = test_per
In [8]:
# Flatten newlines so the per-wiki description dumps read cleanly as CSV
# files for manual inspection.
for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'):
    df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r')), orient='index')
    df['text'] = df['text'].apply(lambda x: x.replace('\n',' ') if x else x)
    df.to_csv('helpers/inspection/readable/{}_modern_bios_for_inspection.csv'.format(xxwiki), encoding='utf-8')
In [18]:
# Summarise: combined actress+player share per wiki, sorted ascending.
# NOTE(review): this rebinds `celebdf`, clobbering the decade-level frame
# built earlier; later cells expecting the old columns will break.
celebdf = pd.DataFrame.from_dict(celeb, orient='index')
celebdf['either'] = celebdf['player'] + celebdf['actress']
# NOTE(review): .sort() is the old pandas spelling of .sort_values().
celebdf.sort('either')
Out[18]:
In [19]:
# NOTE(review): broken scratch cell — the method is convert_objects
# (plural, itself long deprecated), and this frame has no 'wiki' column
# after the rebind above, so this line raises; safe to delete.
celebdf['wiki'].convert_object(convert_dates=True)
In [14]:
# Display the per-wiki summary frame.
celebdf
Out[14]:
In [15]:
# NOTE(review): stale cell — after the rebind above, celebdf no longer has
# 'celeb_per'/'decade'/'wiki' columns, so this pivot raises KeyError.
natlangpiv = pd.pivot_table(celebdf, values='celeb_per', rows='decade', cols='wiki')
In [16]:
# Display the pivot (only meaningful if the previous cell succeeded).
natlangpiv
Out[16]:
In [ ]: